Final Implementation of the Project

Exploratory Data Analaysis

# Importing all the required libraries
library(tidyverse)
library(ggplot2)
library(plotly)
library(ipumsr)
library(haven)
library(splitstackshape)
# A. Loading the data and creating a dataframe
df_ddi <- read_ipums_ddi("Dataset/nhis_00001.xml")
df <- as.data.frame(read_ipums_micro(df_ddi, verbose = FALSE))
df %>% head(5)
##   YEAR SERIAL STRATA PSU        NHISHID REGION URBRRL PERNUM          NHISPID
## 1 2020      1    150  25 0002020H000002      3      2      1 0002020H00000210
## 2 2020      2    111  10 0002020H000003      4      3      1 0002020H00000310
## 3 2020      3    133   3 0002020H000004      2      2      1 0002020H00000410
## 4 2020      4    139  45 0002020H000007      3      3      1 0002020H00000710
## 5 2020      5    130  21 0002020H000009      3      1      1 0002020H00000910
##       HHX SAMPWEIGHT LONGWEIGHT PARTWEIGHT ASTATFLG CSTATFLG AGE SEX SEXORIEN
## 1 H000002   5946.002   17605.50      0.000        1        0  56   2        2
## 2 H000003   6288.726       0.00   5853.664        1        0  66   1        2
## 3 H000004   6083.271       0.00   5814.397        1        0  59   2        2
## 4 H000007  11306.962       0.00  10626.550        1        0  56   1        2
## 5 H000009   6471.818   19317.18      0.000        1        0  48   2        2
##   MARSTAT MARST FAMSIZE PARTNEREMP FAMKIDNO RACEA ARMFEV EDUC SPOUSEDUC
## 1      10    11       3          0        1   100     11  501         3
## 2      10    11       2          0        0   100     11  501        10
## 3      10    11       2          0        0   100     11  502         9
## 4      30    30       1          0        0   100     11  103         0
## 5      99    99       4          0        1   100     98  400         0
##   SCHOOLNOW EMPSTAT HOURSWRK PAIDSICK EMPHI EMPFT INCFAM07ON FAMTOTINC USUALPL
## 1         1     200        0        0     0     0         11     27000       2
## 2         1     100       40        2     2     2         24    250000       3
## 3         1     100       40        1     1     2         24    150000       2
## 4         1     999        0        0     0     0         22     54000       2
## 5         8     999        0        0     0     0         23     95000       2
##   HINOTCOVE CVDDIAG CVDTEST CVDTESTRSLT
## 1         1       1       1           0
## 2         1       0       0           0
## 3         1       0       0           0
## 4         1       0       0           0
## 5         2       1       1           0
# B. Preliminary check on the data
#  a. Review classes
sapply(df, class)
## $YEAR
## [1] "numeric"
## 
## $SERIAL
## [1] "numeric"
## 
## $STRATA
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $PSU
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $NHISHID
## [1] "character"
## 
## $REGION
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $URBRRL
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $PERNUM
## [1] "numeric"
## 
## $NHISPID
## [1] "character"
## 
## $HHX
## [1] "character"
## 
## $SAMPWEIGHT
## [1] "numeric"
## 
## $LONGWEIGHT
## [1] "numeric"
## 
## $PARTWEIGHT
## [1] "numeric"
## 
## $ASTATFLG
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CSTATFLG
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $AGE
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $SEX
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $SEXORIEN
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $MARSTAT
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $MARST
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $FAMSIZE
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $PARTNEREMP
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $FAMKIDNO
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $RACEA
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $ARMFEV
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EDUC
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $SPOUSEDUC
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $SCHOOLNOW
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EMPSTAT
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $HOURSWRK
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $PAIDSICK
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EMPHI
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $EMPFT
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $INCFAM07ON
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $FAMTOTINC
## [1] "haven_labelled" "vctrs_vctr"     "double"        
## 
## $USUALPL
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $HINOTCOVE
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CVDDIAG
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CVDTEST
## [1] "haven_labelled" "vctrs_vctr"     "integer"       
## 
## $CVDTESTRSLT
## [1] "haven_labelled" "vctrs_vctr"     "integer"
# Note: some of the variables have attached data definitions "haven labeled"

# b. Check for missing values
sapply(df, function(x) sum(is.na(x)))
##        YEAR      SERIAL      STRATA         PSU     NHISHID      REGION 
##           0           0           0           0           0           0 
##      URBRRL      PERNUM     NHISPID         HHX  SAMPWEIGHT  LONGWEIGHT 
##           0           0           0           0           0           0 
##  PARTWEIGHT    ASTATFLG    CSTATFLG         AGE         SEX    SEXORIEN 
##           0           0           0           0           0           0 
##     MARSTAT       MARST     FAMSIZE  PARTNEREMP    FAMKIDNO       RACEA 
##           0           0           0           0           0           0 
##      ARMFEV        EDUC   SPOUSEDUC   SCHOOLNOW     EMPSTAT    HOURSWRK 
##           0           0           0           0           0           0 
##    PAIDSICK       EMPHI       EMPFT  INCFAM07ON   FAMTOTINC     USUALPL 
##           0           0           0           0           0           0 
##   HINOTCOVE     CVDDIAG     CVDTEST CVDTESTRSLT 
##           0           0           0           0
#c. Dropping variables that are not that important or provide no inisghts in the data
droppable_cols <- c("YEAR","SERIAL","STRATA","PSU","NHISHID","NHISPID","HHX","SAMPWEIGHT","LONGWEIGHT","PARTWEIGHT")
df <- df %>% dplyr::select(-all_of(droppable_cols))
colnames(df)
##  [1] "REGION"      "URBRRL"      "PERNUM"      "ASTATFLG"    "CSTATFLG"   
##  [6] "AGE"         "SEX"         "SEXORIEN"    "MARSTAT"     "MARST"      
## [11] "FAMSIZE"     "PARTNEREMP"  "FAMKIDNO"    "RACEA"       "ARMFEV"     
## [16] "EDUC"        "SPOUSEDUC"   "SCHOOLNOW"   "EMPSTAT"     "HOURSWRK"   
## [21] "PAIDSICK"    "EMPHI"       "EMPFT"       "INCFAM07ON"  "FAMTOTINC"  
## [26] "USUALPL"     "HINOTCOVE"   "CVDDIAG"     "CVDTEST"     "CVDTESTRSLT"
# Feature Engineering variables 
# REGION, URBRRL PERNUM ASTATFLG CSTATFLG AGE FAMKIDNO dont need any feature engineering yet

# a. Handling SEX AND SEXORIEN variable
# Combining all 'unknown categories' into one
unk <- c(7,8,9)
df$SEX[df$SEX %in% unk] = 9

# Combining Unknown SEXORIEN into "Something else" category
unk <- c(5,7,8)
df$SEXORIEN[df$SEXORIEN %in% unk] = 4

# b. Handling RACEA Variable
# Combining all 'unknown categories' into one
unk <- c(580, 900, 970, 980, 990)
df$RACEA[df$RACEA %in% unk] = 900

#c. Handling MARSTAT & MARST Variable
# Since both of the variables are indicative of same features it makes sense to keep on their current marital status for this analysis.
df <- df %>% select(-MARSTAT)

# Combining NIU and Unknown into one and combining all married labels into one
unk <- c(0,99)
df$MARST[df$MARST %in% unk] = 99

marr <- c(10,11,12,13)
df$MARST[df$MARST %in% marr] = 10

# d. Handling FAMSIZE 
# Combine Unknowns into one
unk <- c(98,99)
df$FAMSIZE[df$FAMSIZE %in% unk] = 98

# e. Handling PARTNEREMP
unk <- c(7,8,9)
df$PARTNEREMP[df$PARTNEREMP %in% unk] = 9

# f. Handling ARMFEV by combining all unknowns
unk <- c(97,98,99)
df$ARMFEV[df$ARMFEV %in% unk] = 99

# g. Handling EDUC, SPOUSEDUC, SCHOOLNOW, by combining all unknowns
unk <- c(996,997,998,999)
df$EDUC[df$EDUC %in% unk] = 996

unk <- c(97,98,99)
df$SPOUSEDUC[df$SPOUSEDUC %in% unk] = 99

unk <- c(7,8,9)
df$SCHOOLNOW[df$SCHOOLNOW %in% unk] = 9

#h. Handling Employment Status  
# Table below shows the distribution of employment categories
#Labels:
#value                                            label
#     0                                              NIU
#   100                                         Employed
#   110                                          Working
#   111                  Working for pay at job/business
#   112              Working, w/out pay, at job/business
#   120                        With job, but not at work
#   121 With job, not at work: not laid-off, not looking
#   122                   With job, not at work: looking
#   200                                     Not employed
#   210                                       Unemployed
#   211                            Unemployed: On layoff
#   212                Unemployed: On layoff and looking
#   213           Unemployed: Unk if looking or laid off
#   214                 Unemployed: Looking or on layoff
#   215                Unemployed: Have job to return to
#   216             Unemployed: Had job during the round
#   217       Unemployed: No job during reference period
#   220                               Not in labor force
#   900                               Unknown-all causes
#   997                                  Unknown-refused
#   998                          Unknown-not ascertained
#   999                               Unknown-don't know

# Combining all working into one, with job into one, Unemployed into one and unknown into one
work <- c(110,111,112)
df$EMPSTAT[df$EMPSTAT %in% work] = 110
w_job <- c(120,121,122)
df$EMPSTAT[df$EMPSTAT %in% w_job] = 120
unemployed <- c(200,210,211:217)
df$EMPSTAT[df$EMPSTAT %in% unemployed] = 200
unk <- c(997:999)
df$EMPSTAT[df$EMPSTAT %in% unk] = 999

# i. Handling HOURSWRK by replacing number of hours unknown into 0
unk <- c(0,97:99)
df$HOURSWRK[df$HOURSWRK %in% unk] = 0

# j. Handling all variables remaining 
# Combined unknowns into one
df <- df %>% mutate(PAIDSICK = replace(PAIDSICK,PAIDSICK >4,9))  

# Mutating EMPHI
df <- df %>% mutate(EMPHI = replace(EMPHI,EMPHI >4,9))  

# Mutating USUALPL
# value                         label
#     0                           NIU
#     1       There is no place or No
#     2 Yes, has a usual place or Yes
#     3  There is more than one place
#     7               Unknown-refused
#     8       Unknown-not ascertained
#     9            Unknown-don't know

# 2-3 combined as 2 and 7,8,9 combined as 3

df <- df %>% 
    mutate(USUALPL = replace(USUALPL, USUALPL == 3,2))  

df <- df %>% 
    mutate(USUALPL = replace(USUALPL, USUALPL >= 7,9))  

# Mutating HINOTCOVE
# Combine all unknowns into one
df <- df %>% mutate(HINOTCOVE = replace(HINOTCOVE,HINOTCOVE >4,9))  

# Mutating CVDTEST
# Same transformation as above
df <- df %>% mutate(CVDTEST = replace(CVDTEST,CVDTEST >4,9))  

# Mutating CVDDIAG
# Same transformation as above
df <- df %>% mutate(CVDDIAG = replace(CVDDIAG,CVDDIAG >4,9))  

# Mutating CVDTESTRSLTS
#Labels:
# value                   label
#     0                     NIU
#     1                      No
#     2                     Yes
#     3 Did not receive results
#     7         Unknown-refused
#     8 Unknown-not ascertained
#     9      Unknown-don't know

#Similar transformation where we combine all unknowns to 9
df <- df %>% mutate(CVDTESTRSLT = replace(CVDTESTRSLT,CVDTESTRSLT >4,9))  

Data Visualization on the clean data

# A. Checking for health insurance coverage and covid relation
counts_hinc <- table(df$HINOTCOVE, df$CVDTESTRSLT)

rownames(counts_hinc) <- c('Has coverage','No coverage','Unknown')
colnames(counts_hinc) <- c('NIU', 'Negative','Positive',"Did Not Receive results",'Unknown')

barplot(counts_hinc[,2:3], col = rainbow(3:6),
        main = 'Test Results based on Insurance Coverage', legend = rownames(counts_hinc)[1:2], xlab = 'COVID Test Result')

# B. Seeing the number of people who got tested given they have health insurance coverage from the company
sample_df <- df %>% group_by(CVDTEST = as.factor(CVDTEST),EMPHI = as.factor(EMPHI)) %>% dplyr::summarize(count = n())
## `summarise()` has grouped output by 'CVDTEST'. You can override using the `.groups` argument.
ggplot(sample_df, aes(fill=CVDTEST, y=count, x=CVDTEST)) + 
    geom_bar(position="dodge", stat="identity") +
    ggtitle("People who tested when they get Health insurance coverage from the Company") +
    facet_wrap(~EMPHI, ncol = 4, labeller = labeller(CVDTEST = c("0" = "NIU","1" =  "Not Covered by Company", "2" = "Covered by Company", "9" = "Unknown" ))) +
    theme(legend.position="none") +
    xlab("Covid Test") + ylab("Frequency") + scale_x_discrete(labels = c("NIU","Negative","Positive","No Results")) + scale_fill_manual(values = c("#A6CEE3","#1F78B4","#B2DF8A","#33A02C")) + theme_minimal() + theme(plot.title = element_text(hjust = 0.5),axis.text.x = element_text(angle = 30))

# C. Seeing results between getting sick leave and covid result tests
counts_ps <- table(df$PAIDSICK, df$CVDTESTRSLT)
rownames(counts_ps) <- c('NIU','No sick leave','Sick leave', 'Unknown')
colnames(counts_ps) <- c('NIU', 'Negative','Positive', "Did Not Receive Results",'Unknown')


barplot(counts_ps[2:3,2:3], col = rainbow(2), 
        legend = rownames(counts_ps)[2:3], main = 'Test Results based on having Sick Leave', xlab = 'Testing Opportunity')

# D. Does having a usual place for medical care affect the testing
sample_df <- df %>% filter(USUALPL == 2)
sample_df <- sample_df %>% group_by(CVDTEST) %>% summarise(n = n())
sample_df %>% head()
## # A tibble: 4 x 2
##                  CVDTEST     n
##                <int+lbl> <int>
## 1 0 [NIU]                16418
## 2 1 [No]                 12956
## 3 2 [Yes]                 5236
## 4 9 [Unknown-don't know]    33
fig <- plot_ly(sample_df, x = ~CVDTEST, y = ~n, type = 'bar',
        marker = list(color = c('rgba(204,204,204,1)', 'rgba(204,204,204,1)', 'rgba(222,45,38,0.8)','rgba(204,204,204,1)')))

fig <- fig %>% layout(title = "Seeing if having Usual Place of medical care encourages Covid Testing",
         xaxis = list(title = ""),
         yaxis = list(title = ""))

fig

Data Modelling

Question 1

How does socioeconomic status affect COVID-19 infection status?

# A. Creating Stratified Sample
table(df$CVDDIAG)
## 
##     0     1     2     9 
## 17649 18954   671    84
strat <- stratified(df, group = 'CVDDIAG', size = 1500)
## Groups 2, 9 contain fewer rows than requested. Returning all rows.
# Dropping the NIUs & Unknown as the provide no insights into the data
strat <- strat %>% filter(CVDDIAG != 0)
strat <- strat %>% filter(CVDDIAG != 9)
strat$CVDDIAG[strat$CVDDIAG == 1] = 0
strat$CVDDIAG[strat$CVDDIAG == 2] = 1

# Seeing the distribution of the dataset
table(strat$CVDDIAG)/length(strat$CVDDIAG)
## 
##         0         1 
## 0.6909258 0.3090742
# Visualising it
barplot(table(strat$CVDDIAG))

# Fitting a logi

Question 2

What are the driving factors behind infections?

Question 3

What type of working conditions impacts the ability to get tested for COVID?